How to characterize different groups of visitors? Is there a pattern in stations that visitor tends to visit?
library(tidyr)
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
library(dplyr)
##
## Attaching package: 'dplyr'
##
## Następujące obiekty zostały zakryte z 'package:stats':
##
## filter, lag
##
## Następujące obiekty zostały zakryte z 'package:base':
##
## intersect, setdiff, setequal, union
library(lubridate)
library(reshape)
##
## Attaching package: 'reshape'
##
## Następujący obiekt został zakryty z 'package:lubridate':
##
## stamp
##
## Następujący obiekt został zakryty z 'package:dplyr':
##
## rename
##
## Następujący obiekt został zakryty z 'package:tidyr':
##
## expand
library(ggplot2)
library(MASS)
##
## Attaching package: 'MASS'
##
## Następujący obiekt został zakryty z 'package:dplyr':
##
## select
library(cluster)
library(pvclust)
library(dendextend)
##
## Welcome to dendextend version 1.1.2
##
## Type ?dendextend to access the overall documentation and
## browseVignettes(package = 'dendextend') for the package vignette.
## You can execute a demo of the package via: demo(dendextend)
##
## More information is available on the dendextend project web-site:
## https://github.com/talgalili/dendextend/
##
## Contact: <tal.galili@gmail.com>
## Suggestions and bug-reports can be submitted at: https://github.com/talgalili/dendextend/issues
##
## To suppress the this message use:
## suppressPackageStartupMessages(library(dendextend))
##
##
## Attaching package: 'dendextend'
##
## Następujący obiekt został zakryty z 'package:dplyr':
##
## %>%
##
## Następujący obiekt został zakryty z 'package:tidyr':
##
## %>%
##
## Następujący obiekt został zakryty z 'package:stats':
##
## cutree
library(ape)
##
## Attaching package: 'ape'
##
## Następujące obiekty zostały zakryte z 'package:dendextend':
##
## ladderize, rotate
library(RColorBrewer)
library(scales)
library(colorspace) # get nice colors
library(plotly)
##
## Attaching package: 'plotly'
##
## Następujący obiekt został zakryty z 'package:dendextend':
##
## %>%
##
## Następujący obiekt został zakryty z 'package:ggplot2':
##
## last_plot
##
## Następujący obiekt został zakryty z 'package:graphics':
##
## layout
library(stringdist)
load("verySmallLogs.rda")
#Transforming data in the same way as in phase 1
data <- verySmallLogs %>%
mutate(station,
visitor,
type,
date,
weekday = wday(date, label=TRUE, abbr=FALSE),
hour = substr(date, 12, 13))
data$date <- as.POSIXct(data$date,format="%Y-%m-%d %H:%M:%S")
data = data[data$visitor != -1,]
dataEntering <- data[data$type=="Entering" & data$visitor != -1,]
dataLeaving <- data[data$type=="Leaving" & data$visitor != -1,]
newdataEntering = dataEntering %>%
group_by(visitor, station) %>%
summarise(min_date=min(date),
weekday=head(weekday,1),
hour=head(hour,1),
count = n())
newdataLeaving = dataLeaving %>%
group_by(visitor, station) %>%
summarise(max_date=max(date),
weekday=head(weekday,1),
hour=head(hour,1),
count = n())
mergedData <- merge(newdataEntering, newdataLeaving, by=c("visitor", "station"))
mergedData$time = as.numeric(mergedData$max_date-mergedData$min_date)
mergedData$hour = mergedData$hour.x
mergedData$weekday = mergedData$weekday.x
mergedData$count = mergedData$count.x
mergedData = mergedData[mergedData$time < 1000,]
Try to get data by visitor (not by visitor, station) and introduce new features that we tried to use in our clustering
As a distance metric we will use deafault distance between vectors used by kmeans, of selected features from those above. To cluster our data in this part we will use kmeans algorithm.
phase1Data <- mergedData[,c("visitor", "station", "max_date", "min_date", "time", "weekday", "hour", "count")]
phase1Data = phase1Data %>%
group_by(visitor) %>%
summarise(max_date=max(max_date),
min_date=min(min_date),
total_time=sum(time),
min_time=min(time),
max_time=max(time),
weekday=head(weekday,1),
hour=head(hour,1),
total_count = sum(count),
max_count = max(count),
min_count = min(count),
most_freq_station = head(station[which(count == max(count))],1),
least_freq_station = head(station[which(count == min(count))],1))
sampleData <- phase1Data[sample(nrow(phase1Data), 20000),]
sampleData <- sampleData[order(sampleData$visitor),]
rownames(sampleData) <- sampleData$visitor
kmeansData <- transform(sampleData,
visitor = as.numeric(visitor),
max_date = as.POSIXlt(max_date)$hour + as.POSIXlt(max_date)$min/60,
min_date = as.POSIXlt(min_date)$hour + as.POSIXlt(min_date)$min/60,
total_time = as.numeric(total_time),
min_time = as.numeric(min_time),
max_time = as.numeric(max_time),
hour = as.numeric(hour),
label = visitor)
kmeansData$max_date <- scale(kmeansData$max_date)
kmeansData$min_date <- scale(kmeansData$min_date)
kmeansData$total_time <- scale(kmeansData$total_time)
kmeansData$min_time <- scale(kmeansData$min_time)
kmeansData$max_time <- scale(kmeansData$max_time)
kmeansData$total_count <- scale(kmeansData$total_count)
kmeansData$max_count <- scale(kmeansData$max_count)
kmeansData$min_count <- scale(kmeansData$min_count)
Plot a SPLOM: (how features depend of each other)
SPLOM_DATA <- kmeansData[,c("total_time", "max_time", "min_time","min_date", "max_date", "min_count", "max_count", "total_count")]
station_col <- rev(rainbow_hcl(65))[as.numeric(sampleData$most_freq_station)]
pairs(SPLOM_DATA, col = station_col,
lower.panel = NULL,
cex.labels=1, pch=15, cex = 0.75)
We can see that most variance is introduced by total_count and min_data or total_time variables. So we will try to cluster our data using them as main features. We also will visualize our data mostly using total_count and total_time.
pc <- prcomp(SPLOM_DATA)
biplot(pc, xlabs=rep("·", nrow(SPLOM_DATA)))
Low min_date or max_date means that these are users playing mostly in morning ours, while higher values indicates user playing more on evening hours. We will later try to gather more detailed info about visitors in that classes.
set.seed(4)
model1 <- kmeans(kmeansData[,c("total_time", "max_date", "min_date", "total_count")], 3)
kmeansData$cluster <- factor(model1$cluster)
nd <- data.frame(model1$centers)
ggplot(kmeansData, aes(total_time, total_count)) +
geom_text(size=3, aes(label=most_freq_station, color=cluster)) +
geom_point(data=nd, size=3)+
theme_bw()
ggplot(kmeansData, aes(total_time, max_date)) +
geom_text(size=3, aes(label=most_freq_station, color=cluster)) +
geom_point(data=nd, size=3)+
theme_bw()
ggplot(kmeansData, aes(total_time, min_date)) +
geom_text(size=3, aes(label=most_freq_station, color=cluster)) +
geom_point(data=nd, size=3)+
theme_bw()
Firstly we tried to analyse visitors clustered by total_time and total_count features, splitting them to groups of visitors how play long and do many iterations and those to play shorter and do less interactions. In order to do that we cluster in 4 groups and analyze what stations people in every group mostly use.
set.seed(4)
model1 <- kmeans(kmeansData[,c("total_time", "total_count")], 4)
kmeansData$cluster <- factor(model1$cluster)
nd <- data.frame(model1$centers)
ggplot(kmeansData, aes(total_time, total_count)) +
geom_text(size=3, aes(label=most_freq_station, color=cluster)) +
geom_point(data=nd, size=3)+
theme_bw()
kmFirstGroup = kmeansData[kmeansData$cluster == 1,]
kmSecondGroup = kmeansData[kmeansData$cluster == 2,]
kmThirdGroup = kmeansData[kmeansData$cluster == 3,]
kmFourthGroup = kmeansData[kmeansData$cluster == 4,]
Analysis of most_frequent station with data clustered in 4 groups, we can see here that usage of particular stations in those groups are very different. For example a short playing group uses a lot station cnk56 which is nearly not used by medium playing visitors. Lond time players choose cnk18 machine which is not very popular in the rest of groups. Also we can conclude more situations like that.
table1 = table(as.character(kmFirstGroup$most_freq_station))
table2 = table(as.character(kmSecondGroup$most_freq_station))
table3 = table(as.character(kmThirdGroup$most_freq_station))
table4 = table(as.character(kmFourthGroup$most_freq_station))
table = c(table1, table2, table3, table4)
resultTab <- matrix(table, ncol=4, nrow = length(table1), byrow = TRUE)
## Warning in matrix(table, ncol = 4, nrow = length(table1), byrow = TRUE):
## długość danych [34] nie jest pod-wielokrotnością lub wielokrotnością liczby
## wierszy [9]
rownames(resultTab) <- names(table1)
colnames(resultTab) <- c("1", "2", "3", "4")
resultTab
## 1 2 3 4
## cnk05 63 30 6 2
## cnk10 12 52 58 18
## cnk18 68 84 76 183
## cnk19a 313 91 17 67
## cnk20 108 64 10 15
## cnk38 4 17 24 4
## cnk56 7 125 47 27
## cnk61 14 103 73 77
## cnk66 36 105 63 30
par(mar = c(5, 4, 1.5, 0.5), ps = 12, cex = 1, cex.main = 2, las = 1)
barplot(
resultTab,
beside = TRUE,
axes = TRUE,
axis.lty = 1,
col = c("cornflowerblue", "grey", "deepskyblue1", "cornsilk3", "darkgoldenrod4", "darkseagreen3", "bisque3", "coral3", "cyan4"),
panel.first = abline(
h = seq.int(25, 100, 25),
col = "grey",
lty = 2
)
)
legend("topright",
legend = names(table1),
fill = c("cornflowerblue", "grey", "deepskyblue1", "cornsilk3", "darkgoldenrod4", "darkseagreen3", "bisque3", "coral3", "cyan4"))
We can find analogous situation if we analyse a station which is least used by clustered groups of visitors. We can see that distributions of least used station is different in every group.
table1 = table(as.character(kmFirstGroup$least_freq_station))
table2 = table(as.character(kmSecondGroup$least_freq_station))
table3 = table(as.character(kmThirdGroup$least_freq_station))
table4 = table(as.character(kmFourthGroup$least_freq_station))
result <- rep(0, length(table1))
where <- match( names(table4), names(table1) )
result[ where ] <- table4
table4 = result
table = c(table1, table2, table3, table4)
resultTab <- matrix(table, ncol=4, nrow = length(table1))
## Warning in matrix(table, ncol = 4, nrow = length(table1)): długość danych
## [35] nie jest pod-wielokrotnością lub wielokrotnością liczby wierszy [9]
rownames(resultTab) <- names(table1)
colnames(resultTab) <- c("1", "2", "3", "4")
resultTab
## 1 2 3 4
## cnk05 17 63 4 86
## cnk10 37 95 11 73
## cnk18 41 108 8 225
## cnk19a 131 467 30 37
## cnk20 24 61 5 25
## cnk38 5 15 4 49
## cnk56 14 52 11 46
## cnk61 27 89 8 36
## cnk66 13 53 30 17
par(mar = c(5, 4, 1.5, 0.5), ps = 12, cex = 1, cex.main = 2, las = 1)
barplot(
resultTab,
beside = TRUE,
axes = TRUE,
axis.lty = 1,
col = c("cornflowerblue", "grey", "deepskyblue1", "cornsilk3", "darkgoldenrod4", "darkseagreen3", "bisque3", "coral3", "cyan4"),
panel.first = abline(
h = seq.int(25, 100, 25),
col = "grey",
lty = 2
)
)
legend("topright",
legend = names(table1),
fill = c("cornflowerblue", "grey", "deepskyblue1", "cornsilk3", "darkgoldenrod4", "darkseagreen3", "bisque3", "coral3", "cyan4"))
On the other hand all distributions based on hour visitors play are nearly normal (see normal distribution).
table3 = table(as.character(kmThirdGroup$hour))
table1 = table(as.character(kmFirstGroup$hour))
result <- rep(0, length(table3))
where <- match( names(table1), names(table3) )
result[ where ] <- table1
table1 = result
table2 = table(as.character(kmSecondGroup$hour))
result <- rep(0, length(table3))
where <- match( names(table2), names(table3) )
result[ where ] <- table2
table2 = result
table4 = table(as.character(kmFourthGroup$hour))
result <- rep(0, length(table3))
where <- match( names(table4), names(table3) )
result[ where ] <- table4
table4 = result
table = c(table1, table2, table3, table4)
resultTab <- matrix(table, ncol=4, nrow = length(table1))
rownames(resultTab) <- names(table3)
colnames(resultTab) <- c("1", "2", "3", "4")
resultTab
## 1 2 3 4
## 10 22 54 9 35
## 11 33 115 10 63
## 12 46 144 15 74
## 13 36 139 14 81
## 14 61 149 7 95
## 15 35 140 8 91
## 16 34 161 8 92
## 17 26 75 5 52
## 18 11 20 3 16
## 9 5 6 2 8
par(mar = c(5, 4, 1.5, 0.5), ps = 12, cex = 1, cex.main = 2, las = 1)
barplot(
resultTab,
beside = TRUE,
axes = TRUE,
axis.lty = 1,
col = c("cornflowerblue", "grey", "deepskyblue1", "cornsilk3", "darkgoldenrod4", "darkseagreen3", "bisque3", "coral3", "cyan4"),
panel.first = abline(
h = seq.int(25, 100, 25),
col = "grey",
lty = 2
)
)
legend("topright",
legend = names(table3),
fill = c("cornflowerblue", "grey", "deepskyblue1", "cornsilk3", "darkgoldenrod4", "darkseagreen3", "bisque3", "coral3", "cyan4"))
In order to find some pattern of stations visited by a user, we introduced new features and distance metric:
Here as a distance metric we have to use some distance between strings, we use restricted Damerau-Levenshtein distance.
patternData <- mergedData[,c("visitor", "station", "time", "count", "min_date")]
patternData = patternData %>%
arrange(min_date) %>%
group_by(visitor) %>%
summarise(total_time=sum(time),
total_count = sum(count),
first_station = head(station,1),
last_station = tail(station,1),
station_path = paste(station, collapse="_"),
most_freq_station = head(station[which(count == max(count))],1),
least_freq_station = head(station[which(count == min(count))],1))
sampleData <- patternData[sample(nrow(patternData), 20000),]
sampleData <- sampleData[order(sampleData$visitor),]
rownames(sampleData) <- sampleData$visitor
Now we will cluster our data into 4 groups using hierarchical clustering (hclust):
d <- stringdistmatrix(sampleData$station_path, sampleData$station_path)
cl <- hclust(as.dist(d))
sampleData$labels = factor(cutree(cl, k=4))
ggplot(sampleData, aes(total_count, total_time, label=most_freq_station, color=labels))+geom_text(size=3)+theme_bw()
ggplot(sampleData, aes(total_count, total_time, color=labels))+geom_point(size=2)+theme_bw()
firstGroup = (sampleData %>% filter(labels == 1))
secondGroup = (sampleData %>% filter(labels == 2))
thirdGroup = (sampleData %>% filter(labels == 3))
fourthGroup = (sampleData %>% filter(labels == 4))
That way of clustering provides grouping visitors with similar behaviour together, for example with similar starting and ending station. However short playing visitors seem to nearly randomly choose their first station which is usually also their last one.
q1 <- ggplot(sampleData, aes(total_count, first_station, color=labels))+geom_point(size=2)+theme_bw()
q2 <- ggplot(sampleData, aes(total_count, last_station, color=labels))+geom_point(size=2)+theme_bw()
multiplot(q1, q2, cols=2)
As before we will try to compare distributions of most and least frequent stations in groups described by our clustering. We will also show total time distribution shown in every group (notice that we are not clustering on total_time this time).
q1 <- qplot(firstGroup$total_time)+geom_histogram(bins = 15)
q2 <- qplot(secondGroup$total_time)+geom_histogram(bins = 15)
q3 <- qplot(thirdGroup$total_time)+geom_histogram(bins = 15)
q4 <- qplot(fourthGroup$total_time)+geom_histogram(bins = 15)
multiplot(q1, q2, q3, q4, cols=2)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
What we can see here that clustering divided our visitors to groups containing players with different characteristics of time spend on stations. As before we can see a group of players playing long (more than 500) and shortly (less than 500). We see that all people with very short total time are clustered to one group. This can mean that we clustered together people who used our stations “accidentaly” or “sporadically”. This is the kind of visitors which comes, play in one or to machines and go “home”. Next we will analize that data from most frequently used station point of view:
q1 <- qplot(firstGroup$most_freq_station)+geom_bar()
q2 <- qplot(secondGroup$most_freq_station)+geom_bar()
q3 <- qplot(thirdGroup$most_freq_station)+geom_bar()
q4 <- qplot(fourthGroup$most_freq_station)+geom_bar()
multiplot(q1, q2, q3, q4, cols=2)
From the distributions of stations usage by groups of visitors we can see that a group of short players behaves much different than a group of long players which can be considered as common/constant visitors (we can consider them as experianced visitors). Short players use mostly cnk19a which is nearly never used by “long players”. However the more visitor play the more often he chooses cnk05 and cnk56 stations instead of cnk19a, cnk20 or cnk10.
head(firstGroup[,c("station_path")])
## Source: local data frame [6 x 1]
##
## station_path
## (chr)
## 1 cnk61
## 2 cnk10
## 3 cnk10_cnk20_cnk56
## 4 cnk05_cnk10_cnk66_cnk20
## 5 cnk38_cnk61
## 6 cnk61_cnk56
head(secondGroup[,c("station_path")])
## Source: local data frame [6 x 1]
##
## station_path
## (chr)
## 1 cnk10_cnk61_cnk66_cnk56_cnk05
## 2 cnk05_cnk10_cnk66_cnk18_cnk61_cnk38
## 3 cnk10_cnk05_cnk66_cnk20_cnk38
## 4 cnk10_cnk66_cnk20_cnk61_cnk56
## 5 cnk05_cnk66_cnk20_cnk61_cnk56
## 6 cnk10_cnk66_cnk18_cnk61_cnk38
head(thirdGroup[,c("station_path")])
## Source: local data frame [6 x 1]
##
## station_path
## (chr)
## 1 cnk10_cnk05_cnk66_cnk20_cnk38_cnk56_cnk61
## 2 cnk10_cnk05_cnk66_cnk20_cnk18_cnk56_cnk38
## 3 cnk10_cnk05_cnk20_cnk66_cnk38_cnk56_cnk18_cnk61
## 4 cnk10_cnk05_cnk20_cnk66_cnk38_cnk56_cnk18
## 5 cnk66_cnk18_cnk38_cnk56_cnk05_cnk10_cnk61_cnk20
## 6 cnk10_cnk05_cnk66_cnk18_cnk20_cnk61_cnk56
head(fourthGroup[,c("station_path")])
## Source: local data frame [6 x 1]
##
## station_path
## (chr)
## 1 cnk05_cnk10_cnk66_cnk20_cnk18_cnk61_cnk56_cnk38
## 2 cnk10_cnk05_cnk66_cnk20_cnk18_cnk61_cnk56_cnk38
## 3 cnk19a_cnk66_cnk18_cnk56_cnk61_cnk38_cnk20_cnk05_cnk10
## 4 cnk19a_cnk05_cnk20_cnk10_cnk66_cnk61_cnk56_cnk38
## 5 cnk10_cnk19a_cnk18_cnk05_cnk20_cnk38_cnk56_cnk61
## 6 cnk19a_cnk05_cnk66_cnk10_cnk20_cnk18_cnk61_cnk56_cnk38
“Short-time group” users use machines in random way. They start mostly on machines: ‘20’, ‘05’, ‘10’ and ends on ‘56’ or ‘38’. Also they use machines ‘66’, ‘61’, ‘18’, ‘20’ in random order. In “Second middle-time group” station ‘cnk61’ is almost not used by group members. On the other hand cnk20 and cnk38 is used very often. Those people uses also cnk66 and cnk05. In “Group of long players” visitors starts mostly on ‘cnk10’ or ‘cnk19’ machine and then they play on ‘cnk61’, ‘cnk20’, ‘cnk18’ and ‘cnk56’ machines. At the end they finish mostly on ’cnk38’.